R setup

# Load libraries
library(plyr)
library(tidyverse)
library(edgeR)
library(AnnotationHub)
library(magrittr)
library(scales)
library(pander)
library(ggrepel)
library(fgsea)
library(pheatmap)
library(igraph)
library(tidygraph)
library(ggraph)
library(grid)
library(RUVSeq)
library(limma)
# Set working directory
setwd("~/Documents/Bioinformatics/Projects/20190122_Q96K97_NoStress_RNASeq/R")

2019 data analysis

Differential Expression Analysis

Load data

# Load counts analysed by feature counts
counts <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  dplyr::select(Geneid, starts_with("W"), starts_with("Q"))

Create DGEList

# Create DGEList and calculate normalisaton factors
dgeList <- counts %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
# Set group variable
dgeList$samples$group <- colnames(dgeList) %>%
  str_extract("(W|Q)") %>%
  factor(levels = c("W", "Q"))

Add gene information

# Add AnnotationHub and subset to search for zebrafish
ah <- AnnotationHub()
ah %>%
  subset(species == "Danio rerio") %>%
  subset(dataprovider == "Ensembl") %>%
  subset(rdataclass == "EnsDb")
## AnnotationHub with 11 records
## # snapshotDate(): 2019-05-02 
## # $dataprovider: Ensembl
## # $species: Danio rerio
## # $rdataclass: EnsDb
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## #   tags, rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH53201"]]' 
## 
##             title                           
##   AH53201 | Ensembl 87 EnsDb for Danio Rerio
##   AH53705 | Ensembl 88 EnsDb for Danio Rerio
##   AH56671 | Ensembl 89 EnsDb for Danio Rerio
##   AH57746 | Ensembl 90 EnsDb for Danio Rerio
##   AH60762 | Ensembl 91 EnsDb for Danio Rerio
##   ...       ...                             
##   AH64434 | Ensembl 93 EnsDb for Danio Rerio
##   AH64906 | Ensembl 94 EnsDb for Danio rerio
##   AH67932 | Ensembl 95 EnsDb for Danio rerio
##   AH69169 | Ensembl 96 EnsDb for Danio rerio
##   AH73861 | Ensembl 97 EnsDb for Danio rerio
# Select correct Ensembl release
ensDb <- ah[["AH64906"]]
# Extract GenomicRanges object from ensDb
genesGR <- genes(ensDb)
# Remove redundant columns from mcols
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name", 
                                   "gene_biotype", "entrezid")]
transGR <- transcripts(ensDb)
mcols(transGR) <- mcols(transGR)[c("tx_id", "gene_id")]
trans2Gene <- tibble(
  tx_id = transGR$tx_id,
  gene_id = transGR$gene_id
)
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeList$genes <- genesGR[rownames(dgeList),]

Data QC

# Perform logical test to see how many genes were not detected in dataset
dgeList$counts %>%
  rowSums() %>%
  is_greater_than(0) %>%
  table()
## .
## FALSE  TRUE 
##  3927 28130
# Check for genes having > 4 samples with cpm > 1
dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4) %>%
  table()
## .
## FALSE  TRUE 
## 13704 18353
# Create logical vector of genes to keep that fit criteria
genes2keep <- dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeList %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "Before Filtering")
dgeFilt %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "After Filtering")

par(mfrow = c(1,1))

Library sizes

# Check library sizes with box plot
dgeFilt$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library Size") +
  scale_fill_discrete(
    name ="Genotype", 
    labels = c("Wildtype", "Mutant")
  ) +
  scale_x_discrete(labels=c("W" = "Wildtype", "Q" = "Mutant")) +
  theme_bw()

PCA

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pca <- dgeFilt %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 22.27 18.07 16.75 14.73 14.45 13.34 11.87 11.2 5.671e-14
Proportion of Variance 0.2513 0.1655 0.1421 0.1099 0.1058 0.09023 0.07145 0.06362 0
Cumulative Proportion 0.2513 0.4168 0.559 0.6689 0.7747 0.8649 0.9364 1 1
# Plot PCA
pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFilt$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

Differential expression

# Create model matrix
design <- model.matrix(~group, data = dgeFilt$samples)
# Perform exact test on DGEList
topTable <- dgeFilt %>%
  estimateDisp(design = design) %>%
  exactTest() %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", ID.start, ID.end, sep = "-") %>%
  unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
  dplyr::select(
    Geneid = ID.gene_id, 
    Symbol = ID.gene_name,
    AveExpr = logCPM, logFC, 
    P.Value = PValue, 
    FDR, Location, 
    Entrez = ID.entrezid
  ) %>%
  mutate(DE = FDR < 0.05)
# Volcano plot showing DE genes
topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(data = . %>% 
                    dplyr::filter(DE) %>%
                    dplyr::filter(-log10(P.Value) > 4 | abs(logFC) >                        2.5), aes(label = Symbol)) + 
  scale_colour_manual(values = c("grey", "red")) +
  theme_bw() +
  theme(legend.position = "none")

# MD Plot showing DE genes
topTable %>%
  dplyr::arrange(desc(P.Value)) %>%
  ggplot(aes(AveExpr, logFC, colour = DE)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(
    data = . %>% 
      dplyr::filter(DE) %>%
      dplyr::filter(abs(logFC) > 2 | AveExpr > 14),
    aes(label = Symbol)
  ) + 
  scale_colour_manual(values = c("grey", "red")) +
  labs(x = "Average Expression (log2 CPM)",
       y = "log Fold-Change") +
  theme_bw() +
  theme(legend.position = "none")

# Summary of DE genes
topTableDE <- topTable %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) 
topTableDE %>% pander(style = "rmarkdown", split.tables = Inf)
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000091368 AL954327.1 2.656 -5.923 1.214e-08 0.0002228
ENSDARG00000093214 si:ch211-284e13.9 0.8189 1.542 6.086e-07 0.005585
ENSDARG00000037421 egr1 8.527 -0.712 9.887e-07 0.006049
ENSDARG00000017246 prx 2.326 -2.614 1.823e-06 0.008364
ENSDARG00000089477 si:ch211-132g1.3 5.855 0.6079 3.135e-06 0.01089
ENSDARG00000089382 zgc:158463 5.631 0.6536 3.561e-06 0.01089
ENSDARG00000080337 NC_002333.4 11.17 0.4449 5.344e-06 0.01401
ENSDARG00000096829 blvrb 3.025 -1.521 1.643e-05 0.03386
ENSDARG00000093438 CU467110.1 4.596 0.5459 1.752e-05 0.03386
ENSDARG00000091916 ugt5b4 -0.0384 -1.445 1.994e-05 0.03386
ENSDARG00000036304 dnaaf3l 1.358 -1.44 2.029e-05 0.03386

GO Enrichment

ens2Entrez <- file.path("https://uofabioinformaticshub.github.io/Intro-NGS-fib",
                        "data", "ens2Entrez.tsv") %>% 
  url() %>%
  read_tsv()
de <- topTable %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid) %>%
  left_join(ens2Entrez) %>%
  dplyr::filter(!is.na(Entrez)) %>%
  .[["Entrez"]] %>%
  unique()
uv <- topTable %>%
  dplyr::select(Geneid) %>%
  left_join(ens2Entrez) %>%
  dplyr::filter(!is.na(Entrez)) %>%
  .[["Entrez"]] %>%
  unique()
goResults <- goana(de = de, universe = uv, species = "Hs")
goResults %>% 
  rownames_to_column("GO ID") %>%
  as_tibble() %>%
  dplyr::filter(DE > 1) %>%
  dplyr::arrange(P.DE) %>%
  mutate(FDR = p.adjust(P.DE, "fdr")) %>%
  dplyr::filter(FDR < 0.05) %>%
  mutate(`GO ID` = str_replace(`GO ID`, ":", "\\\\:")) %>%
  pander(caption = "GO Terms potentially enriched in the set of differentially expressed genes")
GO Terms potentially enriched in the set of differentially expressed genes
GO ID Term Ont N DE P.DE FDR

Gene Set Enrichment Analysis (GSEA)

Setting up ID conversion, ranks and pathways

# Load id conversion file
idConvert <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
  dplyr::select(Geneid = zfID, EntrezID = Entrez) %>%
  mutate(EntrezID = as.character(EntrezID))
# Create function to convert ids (Not sure how this works, Steve wrote it)
convertHsEG2Dr <- function(ids, df = idConvert){
  dplyr::filter(df, EntrezID %in% ids)$Geneid
}
# Conversion of zebrafish ensembl ID to zebrafish symbol, for plotting on network analyses
idConvertSymbol <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
  dplyr::select(label = zfID, symbol = zfName) %>%
  na.omit() %>%
  unique()
# Create named vector of gene level statistics 
ranks <- topTable %>%
  mutate(stat = -sign(logFC) * log10(P.Value)) %>%
  dplyr::arrange(desc(stat)) %>%
  with(structure(stat, names = Geneid))
# Import hallmark human gene genesets and tidy gene set names
# .gmt files downloaded from:
# http://software.broadinstitute.org/gsea/downloads.jsp 
# http://data.wikipathways.org/20190610/ 
hallmark <- gmtPathways("../files/h.all.v6.2.entrez.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "HALLMARK_"))
kegg <- gmtPathways("../files/c2.cp.kegg.v6.2.entrez.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "KEGG_"))
wiki <- gmtPathways("../files/wikipathways-20190610-gmt-Homo_sapiens.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "%.+"))

Hallmark

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaHallmark <- fgsea(hallmark, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaHallmarkTop <- fgseaHallmark %>%
  dplyr::filter(padj < 0.05)
fgseaHallmarkTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Hallmark pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 13 most significantly enriched Hallmark pathways. This corresponds to an FDR of 0.284%
pathway pval FDR ES NES size padj
MYC_TARGETS_V1 1.236e-05 0.0001011 0.6076 2.418 220 0.0006178
OXIDATIVE_PHOSPHORYLATION 1.237e-05 0.0001011 0.6488 2.58 219 0.0006184
INTERFERON_GAMMA_RESPONSE 1.251e-05 0.0001011 0.5684 2.242 200 0.0006256
E2F_TARGETS 1.254e-05 0.0001011 0.464 1.827 197 0.0006268
ALLOGRAFT_REJECTION 1.256e-05 0.0001011 0.5472 2.153 195 0.0006278
DNA_REPAIR 1.304e-05 0.0001011 0.5417 2.069 150 0.0006519
INTERFERON_ALPHA_RESPONSE 1.415e-05 0.0001011 0.5773 2.034 83 0.0007075
MYOGENESIS 5.222e-05 0.0003264 -0.4095 -1.859 216 0.002611
WNT_BETA_CATENIN_SIGNALING 6.038e-05 0.0003354 -0.5614 -2.034 52 0.003019
ADIPOGENESIS 0.0001853 0.0008935 0.4063 1.617 220 0.009267
MTORC1_SIGNALING 0.0001966 0.0008935 0.4066 1.624 228 0.009829
G2M_CHECKPOINT 0.0005689 0.002371 0.3955 1.572 216 0.02845
KRAS_SIGNALING_DN 0.0007387 0.002841 -0.3669 -1.599 155 0.03694
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  hallmark[fgseaHallmarkTop$pathway], ranks, fgseaHallmark, gseaParam = 0.5
)

KEGG

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for KEGG
fgseaKEGG <- fgsea(kegg, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaKEGGTop <- fgseaKEGG %>%
  dplyr::filter(padj < 0.05)
fgseaKEGGTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched KEGG pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 29 most significantly enriched KEGG pathways. This corresponds to an FDR of 0.171%
pathway pval FDR ES NES size padj
CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 1.207e-05 0.0002836 0.4479 1.812 260 0.002246
CHEMOKINE_SIGNALING_PATHWAY 1.268e-05 0.0002836 0.6442 2.524 187 0.002358
HUNTINGTONS_DISEASE 1.276e-05 0.0002836 0.5849 2.281 180 0.002373
ALZHEIMERS_DISEASE 1.293e-05 0.0002836 0.555 2.141 164 0.002406
SPLICEOSOME 1.333e-05 0.0002836 0.6179 2.32 131 0.002479
OXIDATIVE_PHOSPHORYLATION 1.337e-05 0.0002836 0.7172 2.685 128 0.002486
PARKINSONS_DISEASE 1.344e-05 0.0002836 0.7061 2.63 123 0.002501
RIBOSOME 1.426e-05 0.0002836 0.8662 3.032 79 0.002653
NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY 1.468e-05 0.0002836 0.6316 2.126 62 0.00273
PROTEASOME 1.525e-05 0.0002836 0.7112 2.259 45 0.002836
LINOLEIC_ACID_METABOLISM 2.872e-05 0.0004856 -0.6266 -2.178 43 0.005342
ECM_RECEPTOR_INTERACTION 3.345e-05 0.0005089 -0.5644 -2.212 79 0.006222
ASCORBATE_AND_ALDARATE_METABOLISM 4.629e-05 0.0005089 -0.7435 -3.304 181 0.00861
PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 4.688e-05 0.0005089 -0.7556 -3.364 184 0.008719
STARCH_AND_SUCROSE_METABOLISM 4.85e-05 0.0005089 -0.7369 -3.307 196 0.00902
PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 4.877e-05 0.0005089 -0.7057 -3.175 199 0.009071
STEROID_HORMONE_BIOSYNTHESIS 5.1e-05 0.0005089 -0.695 -3.152 213 0.009486
FOCAL_ADHESION 5.198e-05 0.0005089 -0.4533 -2.062 219 0.009669
DRUG_METABOLISM_OTHER_ENZYMES 5.222e-05 0.0005089 -0.6915 -3.146 220 0.009714
DRUG_METABOLISM_CYTOCHROME_P450 5.856e-05 0.0005089 -0.6977 -3.24 262 0.01089
RETINOL_METABOLISM 5.896e-05 0.0005089 -0.6954 -3.231 264 0.01097
METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 6.019e-05 0.0005089 -0.6907 -3.216 270 0.01119
PATHWAYS_IN_CANCER 7.027e-05 0.0005683 -0.3296 -1.568 334 0.01307
SYSTEMIC_LUPUS_ERYTHEMATOSUS 7.645e-05 0.0005925 0.61 1.93 44 0.01422
AXON_GUIDANCE 0.0001317 0.0009798 -0.3794 -1.663 162 0.0245
JAK_STAT_SIGNALING_PATHWAY 0.0001705 0.00122 -0.3899 -1.693 151 0.03172
ARRHYTHMOGENIC_RIGHT_VENTRICULAR_CARDIOMYOPATHY_ARVC 0.0002086 0.001437 -0.4541 -1.817 89 0.03881
PATHOGENIC_ESCHERICHIA_COLI_INFECTION 0.000225 0.001495 0.5804 1.894 52 0.04185
NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION 0.0002659 0.001706 -0.3377 -1.544 228 0.04946
# Make a table plot of significant KEGG pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  kegg[fgseaKEGGTop$pathway], ranks, fgseaKEGG, gseaParam = 0.5
)

WikiPathways

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for WikiPathways
fgseaWiki <- fgsea(wiki, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaWikiTop <- fgseaWiki %>%
  dplyr::filter(padj < 0.05)
fgseaWikiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Wiki pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 24 most significantly enriched Wiki pathways. This corresponds to an FDR of 0.181%
pathway pval FDR ES NES size padj
Chemokine signaling pathway 1.284e-05 0.0009715 0.5945 2.303 170 0.006691
Nonalcoholic fatty liver disease 1.299e-05 0.0009715 0.5834 2.239 156 0.006768
mRNA Processing 1.328e-05 0.0009715 0.5371 2.022 133 0.006919
Electron Transport Chain (OXPHOS system in mitochondria) 1.382e-05 0.0009715 0.7417 2.677 97 0.007198
Cytoplasmic Ribosomal Proteins 1.416e-05 0.0009715 0.8704 3.052 80 0.007377
Parkin-Ubiquitin Proteasomal System pathway 1.436e-05 0.0009715 0.6106 2.101 71 0.007483
Oxidative phosphorylation 1.466e-05 0.0009715 0.7527 2.517 60 0.007635
Mitochondrial complex I assembly model OXPHOS system 1.492e-05 0.0009715 0.7502 2.444 52 0.007772
Nicotine Metabolism 2.591e-05 0.001041 -0.768 -2.255 21 0.0135
Striated Muscle Contraction Pathway 2.788e-05 0.001041 -0.6834 -2.256 34 0.01452
Irinotecan Pathway 2.848e-05 0.001041 -0.7029 -2.396 39 0.01484
Proteasome Degradation 2.926e-05 0.001041 0.5898 1.979 61 0.01525
Estrogen metabolism 3.135e-05 0.001041 -0.645 -2.398 59 0.01633
Pregnane X Receptor pathway 3.404e-05 0.001041 -0.6214 -2.444 80 0.01773
Constitutive Androstane Receptor Pathway 3.417e-05 0.001041 -0.6225 -2.452 81 0.0178
Tamoxifen metabolism 3.485e-05 0.001041 -0.6519 -2.594 86 0.01816
Aryl Hydrocarbon Receptor Pathway 3.544e-05 0.001041 -0.6158 -2.472 91 0.01846
Codeine and Morphine Metabolism 3.598e-05 0.001041 -0.7161 -2.895 95 0.01875
TGF-beta Signaling Pathway 4.232e-05 0.00116 -0.4216 -1.822 148 0.02205
Glucuronidation 4.759e-05 0.001205 -0.7632 -3.401 187 0.02479
Focal Adhesion 5.09e-05 0.001205 -0.4598 -2.085 213 0.02652
NRF2 pathway 5.09e-05 0.001205 -0.4504 -2.043 213 0.02652
Metapathway biotransformation Phase I and II 7.519e-05 0.001703 -0.5239 -2.51 360 0.03918
Nuclear Receptors Meta-Pathway 8.361e-05 0.001815 -0.3388 -1.64 404 0.04356
# Make a table plot of significant WikiPathways pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  wiki[fgseaWikiTop$pathway], ranks, fgseaWiki, gseaParam = 0.5
)

Fry

Hallmark

# Create design matrix
design <- model.matrix(~group, data = dgeFilt$samples)
# Create voom object
voom <- voom(dgeFilt, design)
# Create index for fry
fryIndex <- ids2indices(hallmark, rownames(dgeFilt$counts))
# Run fry
fry(voom, index = fryIndex, design = design) %>%
  rownames_to_column("pathway") %>%
  dplyr::arrange(PValue.Mixed) %>%
  as.data.frame() %>%
  column_to_rownames("pathway") 
##                                   NGenes Direction     PValue       FDR
## ALLOGRAFT_REJECTION                  162        Up 0.07596489 0.4567829
## INTERFERON_GAMMA_RESPONSE            174        Up 0.11167661 0.4567829
## INTERFERON_ALPHA_RESPONSE             74        Up 0.16768009 0.4567829
## APICAL_SURFACE                        50      Down 0.50885615 0.6876434
## INFLAMMATORY_RESPONSE                160      Down 0.56905689 0.6939718
## TNFA_SIGNALING_VIA_NFKB              185        Up 0.37021201 0.6309383
## WNT_BETA_CATENIN_SIGNALING            48      Down 0.02324950 0.4567829
## NOTCH_SIGNALING                       38        Up 0.24181096 0.4836219
## COMPLEMENT                           191        Up 0.44013368 0.6381304
## IL2_STAT5_SIGNALING                  198        Up 0.39742195 0.6309383
## P53_PATHWAY                          211        Up 0.39066425 0.6309383
## E2F_TARGETS                          197        Up 0.16026231 0.4567829
## IL6_JAK_STAT3_SIGNALING               68        Up 0.40380054 0.6309383
## FATTY_ACID_METABOLISM                177        Up 0.07310109 0.4567829
## KRAS_SIGNALING_UP                    198      Down 0.80779747 0.8764850
## DNA_REPAIR                           150        Up 0.07456667 0.4567829
## MYC_TARGETS_V1                       220        Up 0.17316997 0.4567829
## SPERMATOGENESIS                      113        Up 0.29564563 0.5474919
## HEME_METABOLISM                      183        Up 0.96210162 0.9621016
## APOPTOSIS                            153        Up 0.22395939 0.4812092
## ADIPOGENESIS                         220        Up 0.13806564 0.4567829
## G2M_CHECKPOINT                       216        Up 0.19928842 0.4744962
## OXIDATIVE_PHOSPHORYLATION            219        Up 0.17907727 0.4567829
## KRAS_SIGNALING_DN                    154      Down 0.12383031 0.4567829
## MYOGENESIS                           215      Down 0.16803951 0.4567829
## APICAL_JUNCTION                      210      Down 0.13454047 0.4567829
## UV_RESPONSE_UP                       171        Up 0.43866048 0.6381304
## UNFOLDED_PROTEIN_RESPONSE            125        Up 0.36669753 0.6309383
## COAGULATION                          109        Up 0.68906753 0.8012413
## EPITHELIAL_MESENCHYMAL_TRANSITION    212      Down 0.17710119 0.4567829
## XENOBIOTIC_METABOLISM                192        Up 0.82389592 0.8764850
## GLYCOLYSIS                           215        Up 0.12093542 0.4567829
## PROTEIN_SECRETION                    101      Down 0.63381732 0.7545444
## MITOTIC_SPINDLE                      245      Down 0.54282931 0.6939718
## PEROXISOME                           119        Up 0.48829625 0.6781892
## PI3K_AKT_MTOR_SIGNALING              120      Down 0.54360742 0.6939718
## ESTROGEN_RESPONSE_LATE               195        Up 0.44669131 0.6381304
## REACTIVE_OXIGEN_SPECIES_PATHWAY       49        Up 0.23098040 0.4812092
## HYPOXIA                              212      Down 0.55836872 0.6939718
## BILE_ACID_METABOLISM                 106      Down 0.86005207 0.8776042
## MTORC1_SIGNALING                     225        Up 0.26835620 0.5160696
## UV_RESPONSE_DN                       165      Down 0.18271314 0.4567829
## ANGIOGENESIS                          36      Down 0.02050403 0.4567829
## ANDROGEN_RESPONSE                    103        Up 0.74186501 0.8242945
## ESTROGEN_RESPONSE_EARLY              196      Down 0.21886178 0.4812092
## MYC_TARGETS_V2                        58        Up 0.71131129 0.8083083
## PANCREAS_BETA_CELLS                   37        Up 0.03061771 0.4567829
## TGF_BETA_SIGNALING                    58      Down 0.17484418 0.4567829
## HEDGEHOG_SIGNALING                    44      Down 0.15130253 0.4567829
## CHOLESTEROL_HOMEOSTASIS               81      Down 0.84782517 0.8776042
##                                   PValue.Mixed FDR.Mixed
## ALLOGRAFT_REJECTION                 0.01848628 0.3566740
## INTERFERON_GAMMA_RESPONSE           0.03181392 0.3566740
## INTERFERON_ALPHA_RESPONSE           0.04321697 0.3566740
## APICAL_SURFACE                      0.04780344 0.3566740
## INFLAMMATORY_RESPONSE               0.07494033 0.3566740
## TNFA_SIGNALING_VIA_NFKB             0.07518999 0.3566740
## WNT_BETA_CATENIN_SIGNALING          0.09854460 0.3566740
## NOTCH_SIGNALING                     0.10005755 0.3566740
## COMPLEMENT                          0.10096802 0.3566740
## IL2_STAT5_SIGNALING                 0.11274573 0.3566740
## P53_PATHWAY                         0.11676611 0.3566740
## E2F_TARGETS                         0.11703767 0.3566740
## IL6_JAK_STAT3_SIGNALING             0.12870020 0.3566740
## FATTY_ACID_METABOLISM               0.13375804 0.3566740
## KRAS_SIGNALING_UP                   0.13988853 0.3566740
## DNA_REPAIR                          0.15004059 0.3566740
## MYC_TARGETS_V1                      0.15209394 0.3566740
## SPERMATOGENESIS                     0.15402406 0.3566740
## HEME_METABOLISM                     0.16080727 0.3566740
## APOPTOSIS                           0.16343028 0.3566740
## ADIPOGENESIS                        0.17038535 0.3566740
## G2M_CHECKPOINT                      0.17155957 0.3566740
## OXIDATIVE_PHOSPHORYLATION           0.17280831 0.3566740
## KRAS_SIGNALING_DN                   0.17877775 0.3566740
## MYOGENESIS                          0.17973694 0.3566740
## APICAL_JUNCTION                     0.19076816 0.3566740
## UV_RESPONSE_UP                      0.20610861 0.3566740
## UNFOLDED_PROTEIN_RESPONSE           0.20906746 0.3566740
## COAGULATION                         0.21054311 0.3566740
## EPITHELIAL_MESENCHYMAL_TRANSITION   0.21934188 0.3566740
## XENOBIOTIC_METABOLISM               0.22499811 0.3566740
## GLYCOLYSIS                          0.22989067 0.3566740
## PROTEIN_SECRETION                   0.23540487 0.3566740
## MITOTIC_SPINDLE                     0.24732601 0.3621172
## PEROXISOME                          0.25760465 0.3621172
## PI3K_AKT_MTOR_SIGNALING             0.26072436 0.3621172
## ESTROGEN_RESPONSE_LATE              0.27065963 0.3657563
## REACTIVE_OXIGEN_SPECIES_PATHWAY     0.29382781 0.3789689
## HYPOXIA                             0.30760921 0.3789689
## BILE_ACID_METABOLISM                0.31055062 0.3789689
## MTORC1_SIGNALING                    0.31075447 0.3789689
## UV_RESPONSE_DN                      0.33700085 0.4011915
## ANGIOGENESIS                        0.35220166 0.4047468
## ANDROGEN_RESPONSE                   0.35617719 0.4047468
## ESTROGEN_RESPONSE_EARLY             0.36628246 0.4069805
## MYC_TARGETS_V2                      0.37701893 0.4098032
## PANCREAS_BETA_CELLS                 0.39298923 0.4180736
## TGF_BETA_SIGNALING                  0.41119435 0.4205139
## HEDGEHOG_SIGNALING                  0.41210367 0.4205139
## CHOLESTEROL_HOMEOSTASIS             0.62892117 0.6289212

Network Analysis

Hallmark

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigHallmark <-
  fgseaHallmarkTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysHallmark <- names(sigHallmark) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesHallmark <- unique(unlist(sigHallmark)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesHallmark <- full_join(pathwaysHallmark, genesHallmark, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesHallmark <- ldply(sigHallmark, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesHallmark, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesHallmark, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyHallmark <- 
  tbl_graph(nodes = nodesHallmark, edges = edgesHallmark, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaHallmarkTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDE$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTable$Geneid ~ 
        as.integer(row_number(label %in% topTable$Geneid)), 
      id <= nrow(fgseaHallmarkTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaHallmarkTop) ~ rainbow(nrow(fgseaHallmarkTop))[id],
      label %in% topTableDE$Geneid ~ "black"),
    hjust = case_when(
      DE == "ugt5b4" ~ as.integer(0)),
    vjust = case_when(DE == "ugt5b4" ~ as.integer(5))
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaHallmarkTop) ~ rainbow(nrow(fgseaHallmarkTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyHallmark, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaHallmarkTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour), 
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour),
    shape = 21,
    stroke = 0.5, 
    
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways),
    repel = TRUE,
    size = 3, 
    alpha = 0.7,
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE, hjust = hjust, vjust = vjust), 
    repel = TRUE,
    size = 3, 
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

KEGG

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigKEGG <- 
  fgseaKEGGTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>%
  lapply(unlist)
# Create a node list
pathwaysKEGG <- names(sigKEGG) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesKEGG <- unique(unlist(sigKEGG)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesKEGG <- full_join(pathwaysKEGG, genesKEGG, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesKEGG <- ldply(sigKEGG, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesKEGG, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesKEGG, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyKEGG <- 
  tbl_graph(nodes = nodesKEGG, edges = edgesKEGG, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaKEGGTop) ~ label
    ),
    DE = case_when(label %in% topTableDE$Geneid ~ symbol),
    size = case_when(
      label %in% topTable$Geneid ~ row_number(label %in% topTable$Geneid), 
      id <= nrow(fgseaKEGGTop) ~ 4000L
    ) %>% as.integer(),
    colour = case_when(
      id <= nrow(fgseaKEGGTop) ~ rainbow(nrow(fgseaKEGGTop))[id],
      label %in% topTableDE$Geneid ~ "black"
    ),
    hjust = case_when(
      DE == "ugt5b4" ~ -1L,
      DE == "blvrb" ~ 7L
    ),
    vjust = case_when(
      DE == "ugt5b4" ~ 7L,
      DE == "blvrb" ~ 0L
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaKEGGTop) ~ rainbow(nrow(fgseaKEGGTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(26)
# Plot graph
ggraph(tidyKEGG, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaKEGGTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour), 
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, 
    stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways),
    repel = TRUE, 
    size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE, hjust = hjust, vjust = vjust), 
    repel = TRUE,
    size = 3,
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

WikiPathways

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigWiki <- 
  fgseaWikiTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysWiki <- names(sigWiki) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesWiki <- unique(unlist(sigWiki)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesWiki <- full_join(pathwaysWiki, genesWiki, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesWiki <- ldply(sigWiki, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesWiki, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesWiki, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyWiki <- 
  tbl_graph(nodes = nodesWiki, edges = edgesWiki, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaWikiTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDE$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTable$Geneid ~ 
        as.integer(row_number(label %in% topTable$Geneid)), 
      id <= nrow(fgseaWikiTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaWikiTop) ~ rainbow(nrow(fgseaWikiTop))[id],
      label %in% topTableDE$Geneid ~ "black"
    ),
    hjust = case_when(
      DE == "ugt5b4" ~ as.integer(1),
      DE == "blvrb" ~ as.integer(2)
    ),
    vjust = case_when(
      DE == "ugt5b4" ~ as.integer(7),
      DE == "blvrb" ~ as.integer(7)
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaWikiTop) ~ rainbow(nrow(fgseaWikiTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(27)
# Plot graph
ggraph(tidyWiki, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaWikiTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour), 
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, 
    stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE, hjust = hjust, vjust = vjust), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

RUVSeq

k = 1

# Extract subset of low expressed genes from DE analysis to act as negative controls for RUVg procedure
negControl <- topTable %>% 
  dplyr::arrange(desc(P.Value)) %>%
  .[1:10000,] %>%
  .$Geneid
# Run RUVSeq
RUVg <- RUVg(dgeFilt$counts, negControl, 1)

PCA

# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse <- dgeFilt
# Replace with results
dgeFalse$counts <- RUVg$normalizedCounts
# Run PCA function
pcaRUVg <- dgeFalse %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 20.11 18.01 16.48 15.75 14.34 13.36 12.55 11.28 7.413e-14
Proportion of Variance 0.2109 0.1691 0.1417 0.1294 0.1073 0.09307 0.0822 0.06639 0
Cumulative Proportion 0.2109 0.38 0.5217 0.6511 0.7583 0.8514 0.9336 1 1
# Plot PCA
pcaRUVg$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFalse$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

DE Analysis

# Create copy of DGE List 
dgeRUVg <- dgeFilt
# Bind W_1 from RUVSeq analysis to $samples 
dgeRUVg$samples %<>% cbind(RUVg$W)
# Create design matrix
design <- model.matrix(~group + W_1, data = dgeRUVg$samples)
# Perform DE analysis
topTableRUVg <- estimateGLMCommonDisp(dgeRUVg, design) %>%
  estimateGLMTagwiseDisp(design) %>%
  glmFit(design) %>%
  glmLRT(coef=2) %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", ID.start, ID.end, sep = "-") %>%
  unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
  dplyr::select(
    Geneid = ID.gene_id, 
    Symbol = ID.gene_name,
    AveExpr = logCPM, logFC, 
    P.Value = PValue, 
    FDR, Location, 
    Entrez = ID.entrezid
  ) %>%
  mutate(DE = FDR < 0.05)
# Summary of DE genes
topTableRUVg %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>% 
  pander(style = "rmarkdown", split.tables = Inf)
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000091368 AL954327.1 2.655 -5.873 1.759e-12 3.229e-08
ENSDARG00000093214 si:ch211-284e13.9 0.8192 1.567 4.056e-10 3.722e-06
ENSDARG00000017246 prx 2.325 -2.652 1.009e-09 6.175e-06
ENSDARG00000087732 RF00017 2.174 1.108 9.402e-09 4.314e-05
ENSDARG00000058342 grhl2a 2.302 0.9018 3.524e-08 0.0001287
ENSDARG00000024789 mxc 1.82 2.513 4.208e-08 0.0001287
ENSDARG00000091916 ugt5b4 -0.03816 -1.437 6.489e-08 0.0001668
ENSDARG00000036304 dnaaf3l 1.358 -1.439 7.269e-08 0.0001668
ENSDARG00000069961 il21r.1 2.169 0.7942 2.494e-07 0.0005085
ENSDARG00000042577 batf3 0.437 1.155 6.085e-07 0.00103
ENSDARG00000096829 blvrb 3.025 -1.521 6.175e-07 0.00103
ENSDARG00000044129 si:ch211-214p16.1 3.486 0.8206 1.357e-06 0.002076
ENSDARG00000061717 cytip 0.24 1.121 2.899e-06 0.003979
ENSDARG00000041923 ccl38.6 1.874 1.499 3.035e-06 0.003979
ENSDARG00000021794 hmmr 1.117 0.8409 4.282e-06 0.005094
ENSDARG00000097551 si:dkey-7i4.15 2.126 0.9011 4.441e-06 0.005094
ENSDARG00000101112 FO704777.1 1.483 -0.9218 4.769e-06 0.005149
ENSDARG00000002768 pvalb2 2.238 -3.284 5.273e-06 0.005376
ENSDARG00000074212 SLC5A10 0.5022 1.075 5.575e-06 0.005385
ENSDARG00000071499 cxcl32b.1 0.3599 1.442 6.655e-06 0.005978
ENSDARG00000041917 ccl38a.4 0.5656 2.237 6.84e-06 0.005978
ENSDARG00000024877 ptgr1 4.625 2.336 7.44e-06 0.006004
ENSDARG00000103456 cyp2aa11 1.456 -0.8222 7.524e-06 0.006004
ENSDARG00000089477 si:ch211-132g1.3 5.855 0.6167 7.862e-06 0.006012
ENSDARG00000037378 qdprb1 3.965 1.033 9.868e-06 0.007117
ENSDARG00000071437 ptprc 4.363 0.7584 1.027e-05 0.007117
ENSDARG00000086998 zgc:64002 1.376 0.7935 1.047e-05 0.007117
ENSDARG00000116047 CU459094.3 1.674 1.612 1.093e-05 0.007166
ENSDARG00000089382 zgc:158463 5.632 0.6575 1.285e-05 0.008135
ENSDARG00000091548 stard9 3.837 -0.8309 1.432e-05 0.008758
ENSDARG00000105263 ccl36.1 2.253 1.51 1.588e-05 0.009399
ENSDARG00000037421 egr1 8.527 -0.7054 1.672e-05 0.009591
ENSDARG00000062817 crym 1.656 1.335 2.161e-05 0.01202
ENSDARG00000017624 krt4 3.777 -2.333 2.399e-05 0.01263
ENSDARG00000074989 sparcl1 2.651 0.7409 2.436e-05 0.01263
ENSDARG00000038668 gbp1 2.923 1.433 2.478e-05 0.01263
ENSDARG00000068457 tnnt3b 1.893 -2.269 2.583e-05 0.01281
ENSDARG00000102558 pde6ha 2.218 1.489 3.64e-05 0.01758
ENSDARG00000111436 CR933528.1 1.123 -1.54 3.878e-05 0.01825
ENSDARG00000014657 zgc:171731 0.9471 0.8961 4.08e-05 0.01872
ENSDARG00000036189 spata4 1.13 0.9143 4.398e-05 0.01969
ENSDARG00000099197 actc1b 3.643 -2.355 6.207e-05 0.02712
ENSDARG00000035327 ckma 2.35 -2.719 7.182e-05 0.03048
ENSDARG00000058774 CABZ01053221.1 0.1135 0.973 7.306e-05 0.03048
ENSDARG00000101767 si:dkey-183i3.6 0.4311 1.2 7.581e-05 0.03092
ENSDARG00000067990 myhz1.1 2.622 -2.821 8.088e-05 0.03227
ENSDARG00000092136 cenpw 0.7333 0.7699 8.406e-05 0.03233
ENSDARG00000086418 si:ch211-236p5.3 4.007 -0.8025 8.798e-05 0.03233
ENSDARG00000077799 egr4 6.761 -0.9616 8.886e-05 0.03233
ENSDARG00000093438 CU467110.1 4.596 0.5512 8.893e-05 0.03233
ENSDARG00000101495 ugt5b2 0.7437 -1.399 9.262e-05 0.03233
ENSDARG00000098293 si:dkey-27i16.2 4.548 0.5004 9.454e-05 0.03233
ENSDARG00000070579 ggact.3 2.671 -6.26 9.48e-05 0.03233
ENSDARG00000096541 BX901942.1 0.815 0.9853 9.512e-05 0.03233
ENSDARG00000106991 CR388373.3 1.651 -0.7474 9.743e-05 0.03251
ENSDARG00000114910 FO704772.1 6.139 0.5599 0.0001005 0.03295
ENSDARG00000086223 si:ch73-144d13.4 1.873 -0.7772 0.0001138 0.03665
ENSDARG00000091627 si:dkey-271j15.3 0.2788 -1.479 0.0001203 0.03734
ENSDARG00000013524 cyp2ae1 0.3343 -0.9528 0.0001219 0.03734
ENSDARG00000097234 AL929435.1 0.4823 -0.9944 0.0001221 0.03734
ENSDARG00000077474 pla2r1 2.495 -0.6125 0.0001244 0.03743
ENSDARG00000017653 rgs13 1.278 0.8592 0.0001297 0.03841
ENSDARG00000028507 itgb4 4.24 -0.4763 0.0001351 0.03919
ENSDARG00000092062 BX511123.2 1.631 0.8145 0.0001374 0.03919
ENSDARG00000089004 si:ch211-207e14.4 2.794 -0.6261 0.0001388 0.03919
ENSDARG00000002830 trmt2a 3.068 0.5274 0.0001514 0.04211
ENSDARG00000073952 slc4a7 4.765 -0.5462 0.0001577 0.04321
ENSDARG00000095048 si:dkey-250k15.7 1.212 -2.731 0.0001611 0.04349
ENSDARG00000044775 fut7 0.6278 0.915 0.0001664 0.04426
ENSDARG00000061006 cyb5d2 2.749 1.627 0.0001704 0.04462
ENSDARG00000042350 syt1b 2.538 -0.701 0.0001726 0.04462
ENSDARG00000040565 ckmb 2.633 -2.686 0.0001879 0.04789
ENSDARG00000094984 si:ch211-197g15.8 1.448 0.7479 0.0001936 0.04867
ENSDARG00000041864 capn3a 0.1472 -0.993 0.0001978 0.04906
ENSDARG00000010729 CABZ01073795.1 3.718 1.727 0.0002005 0.04906
ENSDARG00000055752 npas4a 7.974 -0.8624 0.0002053 0.0492
ENSDARG00000110160 SBSPON 0.9213 -0.8867 0.0002064 0.0492

2017 data analysis

Differential Expression Analysis

Load data

kalCounts <- read_rds("nhiData/6month_Normoxia.rds")
dgeListNhi <- kalCounts$counts %>%
  as.data.frame() %>%
  rownames_to_column("tx_id") %>%
  as_tibble() %>%
  set_colnames(basename(colnames(.))) %>%
  mutate(tx_id = str_remove_all(tx_id, "\\.[0-9]*$")) %>%
  left_join(trans2Gene) %>%
  group_by(gene_id) %>%
  summarise_if(
    .predicate = is.numeric,
    .funs = sum
    ) %>%
  as.data.frame() %>%
  column_to_rownames("gene_id") %>%
  round() %>%
  set_colnames(str_remove(colnames(.), "[0-9]_MORGAN_")) %>%
  set_colnames(str_remove(colnames(.), "PN")) %>%
  set_colnames(str_remove(colnames(.), "_[RS].+")) %>%
  set_colnames(str_replace(colnames(.), "6P", "6W")) %>%
  DGEList(
    group = colnames(.) %>%
      str_replace_all("(6)([WQ])(.+)", "\\2") %>%
      str_replace_all("W", "WT") %>%
      str_replace_all("Q", "Mut") %>%
      factor(levels = c("WT", "Mut")),
    genes = genesGR[rownames(.)]
  ) %>%
  calcNormFactors() 

Data QC

# Perform logical test to see how many genes were not detected in dataset
dgeListNhi$counts %>%
  rowSums() %>%
  is_greater_than(0) %>%
  table()
## .
## FALSE  TRUE 
##  1282 24299
# Check for genes having >= 4 samples with cpm > 1
dgeListNhi %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4) %>%
  table()
## .
## FALSE  TRUE 
##  7049 18532
# Create logical vector of genes to keep that fit criteria
genes2keepNhi <- dgeListNhi %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFiltNhi <- dgeListNhi[genes2keepNhi,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeListNhi %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "Before Filtering")
dgeFiltNhi %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "After Filtering")

par(mfrow = c(1,1))

Library sizes

# Check library sizes with box plot
dgeFiltNhi$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library Size") +
  scale_fill_discrete(
    name ="Genotype", 
    labels = c("Wildtype","Mutant")
  ) +
  scale_x_discrete(labels=c("w" = "Wildtype", "q" = "Mutant")) +
  theme_bw()

PCA

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pcaNhi <- dgeFiltNhi %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaNhi)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 18.02 16.09 15.83 14.74 14.21 13.33 13.09 4.364e-14
Proportion of Variance 0.2026 0.1616 0.1565 0.1356 0.1261 0.1108 0.1069 0
Cumulative Proportion 0.2026 0.3642 0.5206 0.6562 0.7823 0.8931 1 1
# Plot PCA
pcaNhi$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFiltNhi$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

Differential expression

# Create model matrix
design <- model.matrix(~group, data = dgeFiltNhi$samples)
# Perform exact test on DGEList
topTableNhi <- dgeFiltNhi %>%
  estimateDisp(design = design) %>%
  exactTest() %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", start, end, sep = "-") %>%
  unite("Location", seqnames, Range, strand, sep = ":") %>%
  dplyr::select(Geneid = gene_id, 
                Symbol = gene_name,
                AveExpr = logCPM, logFC, 
                P.Value = PValue, 
                FDR, Location, 
                Entrez = entrezid) %>%
  mutate(DE = FDR < 0.05)
# Volcano plot showing DE genes
topTableNhi %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(data = . %>% 
                    dplyr::filter(DE) %>%
                    dplyr::filter(-log10(P.Value) > 4 | abs(logFC) >                        2.5), aes(label = Symbol)) + 
  scale_colour_manual(values = c("grey", "red")) +
  theme_bw() +
  theme(legend.position = "none")

# MD Plot showing DE genes
topTableNhi %>%
  dplyr::arrange(desc(P.Value)) %>%
  ggplot(aes(AveExpr, logFC, colour = DE)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(data = . %>% 
                    dplyr::filter(DE) %>%
                    dplyr::filter(abs(logFC) > 2 | AveExpr > 14),
                  aes(label = Symbol)) + 
  scale_colour_manual(values = c("grey", "red")) +
  labs(x = "Average Expression (log2 CPM)",
       y = "log Fold-Change") +
  theme_bw() +
  theme(legend.position = "none")

# Summary of DE genes
topTableDENhi <- topTableNhi %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) 
topTableDENhi %>% pander(style = "rmarkdown", split.tables = Inf)
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000087290 si:ch211-202h22.10 0.6292 5.598 9.788e-23 1.814e-18
ENSDARG00000077567 angel1 3.237 -0.7363 7.88e-09 7.301e-05
ENSDARG00000058270 COLGALT1 (1 of many) 5.742 0.5835 2.138e-07 0.001182
ENSDARG00000102796 si:ch1073-83b15.1 1.702 1.163 2.612e-07 0.001182
ENSDARG00000101407 tgm1l4 0.205 -1.664 3.188e-07 0.001182
ENSDARG00000092604 si:ch211-15j1.4 4.745 1.068 2.306e-06 0.007022
ENSDARG00000074266 si:ch1073-314i13.4 6.159 -0.437 3.105e-06 0.007022
ENSDARG00000103727 tfr2 0.8383 -1.198 3.348e-06 0.007022
ENSDARG00000035871 rpl30 6.328 -0.295 3.41e-06 0.007022
ENSDARG00000098688 CABZ01101739.1 3.57 0.5685 6.214e-06 0.01152
ENSDARG00000045248 h3f3d 6.362 -0.3339 1.18e-05 0.01988
ENSDARG00000104839 mansc1 5.318 0.4417 1.694e-05 0.02616
ENSDARG00000075759 samd4a 6.597 0.3786 2.113e-05 0.02805
ENSDARG00000091209 ucp3 0.5011 -1.314 2.119e-05 0.02805
ENSDARG00000094408 MFAP4 (1 of many) 0.1877 -1.82 2.519e-05 0.03112
ENSDARG00000032005 ccdc65 2.008 0.7997 3.23e-05 0.03741
ENSDARG00000096375 march7 6.999 0.3083 3.59e-05 0.03913
ENSDARG00000024847 col5a2b 1.45 -0.9456 4.18e-05 0.04112
ENSDARG00000092807 RPL41 6.716 -0.2908 4.215e-05 0.04112
ENSDARG00000091025 znf1124 0.816 1.289 5.094e-05 0.0472

Gene Set Enrichment Analysis (GSEA)

# Create named vector of gene level statistics 
ranksNhi <- topTableNhi %>%
  mutate(stat = -sign(logFC) * log10(P.Value)) %>%
  dplyr::arrange(desc(stat)) %>%
  with(structure(stat, names = Geneid))

Hallmark

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaHallmarkNhi <- fgsea(hallmark, ranksNhi, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaHallmarkNhiTop <- fgseaHallmarkNhi %>%
  dplyr::filter(padj < 0.05)
fgseaHallmarkNhiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Hallmark pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 4 most significantly enriched Hallmark pathways. This corresponds to an FDR of 0.780%
pathway pval FDR ES NES size padj
OXIDATIVE_PHOSPHORYLATION 2.011e-05 0.001006 -0.4286 -1.738 222 0.001006
MTORC1_SIGNALING 0.0001007 0.002517 -0.4053 -1.648 229 0.005035
MYC_TARGETS_V1 0.0005627 0.007796 -0.3841 -1.557 221 0.02813
XENOBIOTIC_METABOLISM 0.0006237 0.007796 -0.3829 -1.553 223 0.03118
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(hallmark[fgseaHallmarkNhiTop$pathway], 
              ranksNhi, fgseaHallmarkNhi, gseaParam = 0.5)

KEGG

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaKEGGNhi <- fgsea(kegg, ranksNhi, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaKEGGNhiTop <- fgseaKEGGNhi %>%
  dplyr::filter(padj < 0.05)
fgseaKEGGNhiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched KEGG pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 8 most significantly enriched KEGG pathways. This corresponds to an FDR of 0.0924%
pathway pval FDR ES NES size padj
FATTY_ACID_METABOLISM 1.985e-05 0.0005352 -0.705 -2.447 73 0.003692
TYROSINE_METABOLISM 1.986e-05 0.0005352 -0.7169 -2.447 66 0.003694
RIBOSOME 1.988e-05 0.0005352 -0.8126 -2.87 81 0.003698
GLYCOLYSIS_GLUCONEOGENESIS 1.993e-05 0.0005352 -0.6286 -2.271 94 0.003708
METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 2.014e-05 0.0005352 -0.4192 -1.738 274 0.003745
DRUG_METABOLISM_CYTOCHROME_P450 2.014e-05 0.0005352 -0.4515 -1.867 268 0.003746
RETINOL_METABOLISM 2.014e-05 0.0005352 -0.4822 -1.997 273 0.003746
PEROXISOME 3.973e-05 0.0009238 -0.5632 -1.981 79 0.00739
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(kegg[fgseaKEGGNhiTop$pathway], 
              ranksNhi, fgseaKEGGNhi, gseaParam = 0.5)

Wikipathways

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaWikiNhi <- fgsea(wiki, ranksNhi, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaWikiNhiTop <- fgseaWikiNhi %>%
  dplyr::filter(padj < 0.05)
fgseaWikiNhiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Wiki pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 3 most significantly enriched Wiki pathways. This corresponds to an FDR of 0.350%
pathway pval FDR ES NES size padj
Cytoplasmic Ribosomal Proteins 1.995e-05 0.003503 -0.8051 -2.838 81 0.01045
Fatty Acid Omega Oxidation 2.003e-05 0.003503 -0.8832 -2.609 32 0.0105
Ethanol effects on histone modifications 2.006e-05 0.003503 -0.6779 -2.088 39 0.01051
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(wiki[fgseaWikiNhiTop$pathway], 
              ranksNhi, fgseaWikiNhi, gseaParam = 0.5)

Network Analysis

Hallmark

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigHallmarkNhi <-
  fgseaHallmarkNhiTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysHallmarkNhi <- names(sigHallmarkNhi) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesHallmarkNhi <- unique(unlist(sigHallmarkNhi)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesHallmarkNhi <- full_join(pathwaysHallmarkNhi, genesHallmarkNhi, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesHallmarkNhi <- ldply(sigHallmarkNhi, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesHallmarkNhi, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesHallmarkNhi, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyHallmarkNhi <- 
  tbl_graph(nodes = nodesHallmarkNhi, 
            edges = edgesHallmarkNhi, 
            directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaHallmarkNhiTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDENhi$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTableNhi$Geneid ~ 
        as.integer(row_number(label %in% topTableNhi$Geneid)), 
      id <= nrow(fgseaHallmarkNhiTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaHallmarkNhiTop) ~ rainbow(nrow(fgseaHallmarkNhiTop))[id],
      label %in% topTableDENhi$Geneid ~ "black"
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaHallmarkNhiTop) ~ 
        rainbow(nrow(fgseaHallmarkNhiTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyHallmarkNhi, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaHallmarkNhiTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour),
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways), 
    repel = TRUE, size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

KEGG

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigKEGGNhi <-
  fgseaKEGGNhiTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysKEGGNhi <- names(sigKEGGNhi) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesKEGGNhi <- unique(unlist(sigKEGGNhi)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesKEGGNhi <- full_join(pathwaysKEGGNhi, genesKEGGNhi, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesKEGGNhi <- ldply(sigKEGGNhi, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesKEGGNhi, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesKEGGNhi, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyKEGGNhi <- 
  tbl_graph(nodes = nodesKEGGNhi, edges = edgesKEGGNhi, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaKEGGNhiTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDENhi$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTableNhi$Geneid ~ 
        as.integer(row_number(label %in% topTableNhi$Geneid)), 
      id <= nrow(fgseaKEGGNhiTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaKEGGNhiTop) ~ rainbow(nrow(fgseaKEGGNhiTop))[id],
      label %in% topTableDENhi$Geneid ~ "black"
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaKEGGNhiTop) ~ rainbow(nrow(fgseaKEGGNhiTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyKEGGNhi, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaKEGGNhiTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour), 
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, 
    stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

Wikipathways

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigWikiNhi <-
  fgseaWikiNhiTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysWikiNhi <- names(sigWikiNhi) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesWikiNhi <- unique(unlist(sigWikiNhi)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesWikiNhi <- full_join(pathwaysWikiNhi, genesWikiNhi, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesWikiNhi <- ldply(sigWikiNhi, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesWikiNhi, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesWikiNhi, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyWikiNhi <- 
  tbl_graph(nodes = nodesWikiNhi, edges = edgesWikiNhi, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaWikiNhiTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDENhi$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTableNhi$Geneid ~ 
        as.integer(row_number(label %in% topTableNhi$Geneid)), 
      id <= nrow(fgseaWikiNhiTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaWikiNhiTop) ~ rainbow(nrow(fgseaWikiNhiTop))[id],
      label %in% topTableDENhi$Geneid ~ "black"
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaWikiNhiTop) ~ rainbow(nrow(fgseaWikiNhiTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyWikiNhi, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaWikiNhiTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour),
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, 
    stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.8, 
    colour = "black") +
  theme_graph() +
  theme(legend.position = "none")

RUVSeq

k = 1

# Extract subset of low expressed genes from DE analysis to act as negative controls for RUVg procedure
negControlNhi <- topTableNhi %>% 
  dplyr::arrange(desc(P.Value)) %>%
  .[1:10000,] %>%
  .$Geneid
# Run RUVSeq
RUVgNhi <- RUVg(dgeFiltNhi$counts, negControlNhi, 1)

PCA

# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalseNhi <- dgeFiltNhi
# Replace with results
dgeFalse$counts <- RUVgNhi$normalizedCounts
# Run PCA function
pcaRUVgNhi <- dgeFalseNhi %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVgNhi)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 18.02 16.09 15.83 14.74 14.21 13.33 13.09 4.364e-14
Proportion of Variance 0.2026 0.1616 0.1565 0.1356 0.1261 0.1108 0.1069 0
Cumulative Proportion 0.2026 0.3642 0.5206 0.6562 0.7823 0.8931 1 1
# Plot PCA
pcaRUVgNhi$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFalseNhi$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

DE Analysis

# Create copy of DGE List 
dgeRUVgNhi <- dgeFiltNhi
# Bind W_1 from RUVSeq analysis to $samples 
dgeRUVgNhi$samples %<>% cbind(RUVgNhi$W)
# Create design matrix
design <- model.matrix(~group + W_1, data = dgeRUVgNhi$samples)
# Perform DE analysis
topTableRUVgNhi <- estimateGLMCommonDisp(dgeRUVgNhi, design) %>%
  estimateGLMTagwiseDisp(design) %>%
  glmFit(design) %>%
  glmLRT(coef=2) %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", start, end, sep = "-") %>%
  unite("Location", seqnames, Range, strand, sep = ":") %>%
  dplyr::select(
    Geneid = gene_id, 
    Symbol = gene_name,
    AveExpr = logCPM, logFC, 
    P.Value = PValue, 
    FDR, Location, 
    Entrez = entrezid
  ) %>%
  mutate(DE = FDR < 0.05)
# Summary of DE genes
topTableRUVgNhi %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>% 
  pander(style = "rmarkdown", split.tables = Inf)
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000087290 si:ch211-202h22.10 0.6301 4.881 9.265e-26 1.717e-21
ENSDARG00000091253 smyd1b 0.6732 2.6 6.69e-13 6.199e-09
ENSDARG00000102796 si:ch1073-83b15.1 1.702 1.388 1.409e-09 8.703e-06
ENSDARG00000094408 MFAP4 (1 of many) 0.1876 -2.021 4.623e-09 2.142e-05
ENSDARG00000103727 tfr2 0.8382 -1.383 6.576e-08 0.0002437
ENSDARG00000077567 angel1 3.237 -0.7859 1.66e-07 0.0004883
ENSDARG00000025320 col21a1 0.126 1.912 1.844e-07 0.0004883
ENSDARG00000075974 si:dkey-258f14.3 2.847 -1.485 2.153e-07 0.0004987
ENSDARG00000101918 si:ch211-111e20.1 3.081 5.155 3.972e-07 0.0008178
ENSDARG00000101407 tgm1l4 0.2047 -1.65 5.591e-07 0.001036
ENSDARG00000058270 COLGALT1 (1 of many) 5.742 0.6908 6.344e-07 0.001069
ENSDARG00000061177 mov10b.1 1.191 1.294 7.145e-07 0.001103
ENSDARG00000078246 si:ch211-114l13.4 2.125 1.157 7.745e-07 0.001104
ENSDARG00000088745 MFAP4 (1 of many) 1.741 1.103 1.151e-06 0.001524
ENSDARG00000104214 CR381540.3 1.666 -1.075 2.278e-06 0.002814
ENSDARG00000091025 znf1124 0.8163 1.463 2.515e-06 0.002913
ENSDARG00000015337 comta 3.141 -0.7437 3.274e-06 0.003447
ENSDARG00000092604 si:ch211-15j1.4 4.745 1.169 3.348e-06 0.003447
ENSDARG00000091859 si:ch73-233m11.2 1.447 -1.292 4.1e-06 0.003999
ENSDARG00000094297 si:dkey-222h21.2 1.605 0.9738 8.059e-06 0.007467
ENSDARG00000113348 FO904868.1 2.218 1.226 9.293e-06 0.008201
ENSDARG00000100132 CU929402.1 3.621 0.6179 1.026e-05 0.008646
ENSDARG00000041699 piwil1 0.2241 -1.306 1.122e-05 0.00904
ENSDARG00000024847 col5a2b 1.45 -1.015 1.199e-05 0.009261
ENSDARG00000093531 slc37a4b 0.4143 2.907 1.363e-05 0.0101
ENSDARG00000093484 si:dkey-95p16.2 2.553 -0.7538 1.531e-05 0.01091
ENSDARG00000086374 isg15 0.4439 1.47 1.791e-05 0.01229
ENSDARG00000088740 hmgcll1 2.723 1.068 2.093e-05 0.01385
ENSDARG00000100698 eva1bb 2.406 -1.265 2.216e-05 0.01416
ENSDARG00000097292 slc26a6 0.5613 1.652 2.386e-05 0.01474
ENSDARG00000095717 si:dkey-172k15.6 2.068 1.048 3.588e-05 0.02145
ENSDARG00000028912 si:dkey-10h3.2 1.45 -1.079 4.404e-05 0.0255
ENSDARG00000063162 cmtm8b 2.698 0.7116 5.519e-05 0.03099
ENSDARG00000046091 si:ch211-283g2.1 2.825 2.389 6.376e-05 0.03475
ENSDARG00000113526 BX548011.3 7.426 -1.056 7.038e-05 0.03727
ENSDARG00000073948 prok1 1.758 3.299 7.623e-05 0.03924
ENSDARG00000093650 si:ch211-287n14.3 2.653 -0.8693 8.107e-05 0.03951
ENSDARG00000099075 CABZ01032982.1 0.7033 1.081 8.149e-05 0.03951
ENSDARG00000068453 CABZ01112732.1 0.2205 2.332 8.314e-05 0.03951

Integration of 2019 data with 2017 data

Set up DGE List and QC data

# Combine counts into the same DGE List
# First extract counts from 2017 DGE List
countsNhi <- dgeListNhi$counts %>%
  as.data.frame() %>%
  rownames_to_column("Geneid") %>%
  as_tibble()
# Then join with 2019 counts
dgeListComb <- full_join(counts, countsNhi) %>%
  replace(is.na(.), 0) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
# Set group variable
dgeListComb$samples$group <- colnames(dgeListComb) %>%
  str_extract("(W|Q)") %>%
  factor(levels = c("W", "Q"))
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeListComb$genes <- genesGR[rownames(dgeListComb),]
# Create logical vector of genes to keep that fit criteria
genes2keepComb <- dgeListComb %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(8)
# Create new DGEList of genes fitting criteria
dgeFiltComb <- dgeListComb[genes2keepComb,, 
                                   keep.lib.sizes = FALSE] %>%
  calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeListComb %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "Before Filtering")
dgeFiltComb %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "After Filtering")

par(mfrow = c(1,1))

LogFC Comparison

# Extract logFC from topTables 
logFC <- topTable %>%
  dplyr::select(Geneid, Symbol, logFCNhi = logFC)
logFCNhi <- topTableNhi %>%
  dplyr::select(Geneid, Symbol, logFC)
# Join tables
logFCComb <- full_join(logFC, logFCNhi) %>%
  na.omit() 
# Plot logFC graph
logFCComb %>%
  ggplot(aes(logFC,logFCNhi)) +
  geom_point() +
  geom_text_repel(
    aes(label = Symbol),
    data = . %>% 
      dplyr::filter(abs(logFC) > 2 | abs(logFCNhi) > 2) 
    ) +
  geom_abline(slope = 1, intercept = 0, col = "blue") +
  geom_vline(xintercept = c(-1, 1), linetype = 2, colour = "grey50") +
  geom_hline(yintercept = c(-1, 1), linetype = 2, colour = "grey50") +
  labs(title = "LogFC comparison of q96k97del between 2017 and 2019 datasets") +
  xlab("2019") + ylab("2017") +
  xlim(-4, 6) + ylim(-4, 6) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Meta Analysis

Hallmark

# Extract p-values from 2017 and 2019 GSEA results and combine using Fisher's method
p2017Hallmark <- fgseaHallmarkNhi %>%
  dplyr::select(pathway, pval) %>%
  dplyr::mutate(pval = log10(pval)) %>%
  dplyr::rename(log10p2017 = pval)
p2019Hallmark <- fgseaHallmark %>%
  dplyr::select(pathway, pval) %>%
  dplyr::mutate(pval = log10(pval)) %>%
  dplyr::rename(log10p2019 = pval)
fgseaHallmarkMeta <- full_join(p2017Hallmark, p2019Hallmark) %>%
  replace(is.na(.), 0) %>%
  dplyr::mutate(
    chiSquare = -2 * (log10p2017 + log10p2019),
    pCombined = pchisq(chiSquare, df = 4, lower.tail = FALSE),
    FDR = p.adjust(pCombined, "fdr")
  ) %>%
  dplyr::arrange(pCombined)

KEGG

# Extract p-values from 2017 and 2019 GSEA results and combine using Fisher's method
p2017KEGG <- fgseaKEGGNhi %>%
  dplyr::select(pathway, pval) %>%
  dplyr::mutate(pval = log10(pval)) %>%
  dplyr::rename(log10p2017 = pval)
p2019KEGG <- fgseaKEGG %>%
  dplyr::select(pathway, pval) %>%
  dplyr::mutate(pval = log10(pval)) %>%
  dplyr::rename(log10p2019 = pval)
fgseaKEGGMeta <- full_join(p2017KEGG, p2019KEGG) %>%
  replace(is.na(.), 0) %>%
  dplyr::mutate(
    chiSquare = -2 * (log10p2017 + log10p2019),
    pCombined = pchisq(chiSquare, df = 4, lower.tail = FALSE),
    FDR = p.adjust(pCombined, "fdr")
  ) %>%
  dplyr::arrange(pCombined)

Results

# Hallmark
fgseaHallmarkMeta %>%
  dplyr::filter(FDR < 0.05) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched hallmark pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 1 most significantly enriched hallmark pathways. This corresponds to an FDR of 3.58%
pathway log10p2017 log10p2019 chiSquare pCombined FDR
OXIDATIVE_PHOSPHORYLATION -4.697 -4.908 19.21 0.0007152 0.03576
# KEGG
fgseaKEGGMeta %>%
  dplyr::filter(FDR < 0.05) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched KEGG pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 0 most significantly enriched KEGG pathways. This corresponds to an FDR of -Inf
pathway log10p2017 log10p2019 chiSquare pCombined FDR

PCA

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pcaComb <- dgeFiltComb %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 22.27 18.07 16.75 14.73 14.45 13.34 11.87 11.2 5.671e-14
Proportion of Variance 0.2513 0.1655 0.1421 0.1099 0.1058 0.09023 0.07145 0.06362 0
Cumulative Proportion 0.2513 0.4168 0.559 0.6689 0.7747 0.8649 0.9364 1 1
# Create PCA plots
pca12 <- pcaComb$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
  mutate(
    data = case_when(
      str_detect(.$sample, "_") == TRUE ~ "2017",
      str_detect(.$sample, "_") == FALSE ~ "2019"
    )
  ) %>%
  ggplot(aes(PC1, PC2, colour = data, label = sample, shape = group)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw() +
  theme(legend.position = c(1.2, 0.5))
pca13 <- pcaComb$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC3) %>%
  left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
  mutate(
    data = case_when(
      str_detect(.$sample, "_") == TRUE ~ "2017",
      str_detect(.$sample, "_") == FALSE ~ "2019"
    )
  ) %>%
  ggplot(aes(PC1, PC3, colour = data, label = sample, shape = group)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw() +
  theme(legend.position = "none")
pca23 <- pcaComb$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC2, PC3) %>%
  left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
  mutate(
    data = case_when(
      str_detect(.$sample, "_") == TRUE ~ "2017",
      str_detect(.$sample, "_") == FALSE ~ "2019"
    )
  ) %>%
  ggplot(aes(PC2, PC3, colour = data, label = sample, shape = group)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw() +
  theme(legend.position = "none")
# Set up grid for visualisation of 3 PCA's at once.
vp1 <- viewport(x = 0.25, y = 0.25, width = 0.5, height = 0.5)
vp2 <- viewport(x = 0.25, y = 0.75, width = 0.5, height = 0.5)
vp3 <- viewport(x = 0.75, y = 0.75, width = 0.5, height = 0.5)
# Plot PCA plots in grid
if (interactive()) grid::grid.newpage()
print(pca12, vp = vp1)
print(pca13, vp = vp2)
print(pca23, vp = vp3)

RUVSeq

k = 1

# Get genes that are common between the 10000 least DE genes of each dataset
negControlComb <- intersect(negControl, negControlNhi)
# Run RUVSeq
RUVgComb <- RUVg(dgeFiltComb$counts, negControlComb, 1)

PCA

# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalseComb <- dgeFiltComb
# Replace with results
dgeFalseComb$counts <- RUVgComb$normalizedCounts
# Run PCA function
pcaRUVgComb <- dgeFalseComb %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVgComb)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17
Standard deviation 50.61 40.38 16.45 14.4 12.36 11.96 11.26 10.91 10.54 9.974 9.719 9.27 9.03 8.813 8.683 8.256 8.949e-14
Proportion of Variance 0.4338 0.2761 0.04582 0.0351 0.02588 0.02422 0.02146 0.02016 0.01881 0.01685 0.016 0.01455 0.01381 0.01315 0.01277 0.01154 0
Cumulative Proportion 0.4338 0.7099 0.7557 0.7908 0.8167 0.8409 0.8624 0.8825 0.9013 0.9182 0.9342 0.9487 0.9625 0.9757 0.9885 1 1
# Plot PCA
pcaRUVgComb$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFalseComb$samples, "sample")) %>%
  mutate(
    data = case_when(
      str_detect(.$sample, "_") == TRUE ~ "2017",
      str_detect(.$sample, "_") == FALSE ~ "2019"
    )
  ) %>%
  ggplot(aes(PC1, PC2, colour = data, label = sample, shape = group)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

DE Analysis

# Create copy of DGE List 
dgeRUVgComb <- dgeFiltComb
# Bind W_1 from RUVSeq analysis to $samples 
dgeRUVgComb$samples %<>% cbind(RUVgComb$W)
# Create design matrix
design <- model.matrix(~group + W_1, data = dgeRUVgComb$samples)
# Perform DE analysis
topTableRUVgComb <- estimateGLMCommonDisp(dgeRUVgComb, design) %>%
  estimateGLMTagwiseDisp(design) %>%
  glmFit(design) %>%
  glmLRT(coef=2) %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", ID.start, ID.end, sep = "-") %>%
  unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
  dplyr::select(
    Geneid = ID.gene_id, 
    Symbol = ID.gene_name,
    AveExpr = logCPM, logFC, 
    P.Value = PValue, 
    FDR, Location, 
    Entrez = ID.entrezid
  ) %>%
  mutate(DE = FDR < 0.05)
# Summary of DE genes
topTableRUVgComb %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) %>% 
  pander(style = "rmarkdown", split.tables = Inf)
Geneid Symbol AveExpr logFC P.Value FDR